The following page is made for anpr flow data exploration purpose. The data is taken from github repo https://github.com/ppintosilva/congestion18tynewear/blob/master/data-raw/events.R. We’ll set-up caching for this notebook given how computationally expensive some of the code we will write can get.
knitr::opts_chunk$set(cache=TRUE)
options(scipen=9999)
rm(list=ls())
library(tidyverse)
library(lubridate)
#library(sf)
library(maotai)
library(ggpubr)
corridor_levels = c(1, 2, 3)
flows <- read_csv(
file = "data/corridor_A184_WEST_3cameras.csv",
col_names = TRUE,
col_types = list(
o = col_integer(),
d = col_integer(),
t = col_datetime(),
flow = col_integer(),
mean_speed = col_double()
)
) %>%
mutate(o = factor(o, levels = corridor_levels),
d = factor(d, levels = corridor_levels))
flows_23_weekday <-
flows %>%
filter(o == 2 & d == 3) %>%
filter(wday(t, week_start = 1) < 6)
p_daily_flow <-
flows_23_weekday %>%
ggplot() +
geom_line(
aes(x = hms::as_hms(t), y = flow, group = as_date(t)),
alpha = .5
) +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2, 22, 4)),
labels = scales::label_time("%Hh")
) +
theme_bw()
p_daily_flow
flows_23_weekday %>%
group_by(as_date(t)) %>%
summary()
o d t flow mean_speed as_date(t)
1: 0 1: 0 Min. :2018-01-01 00:00:00 Min. : 0.00 Min. : 3.007 Min. :2018-01-01
2:24961 2: 0 1st Qu.:2018-04-02 00:00:00 1st Qu.: 7.00 1st Qu.:30.195 1st Qu.:2018-04-02
3: 0 3:24961 Median :2018-07-02 00:00:00 Median : 48.00 Median :34.933 Median :2018-07-02
Mean :2018-07-01 00:03:03 Mean : 63.88 Mean :34.054 Mean :2018-06-30
3rd Qu.:2018-10-01 00:00:00 3rd Qu.: 93.00 3rd Qu.:40.890 3rd Qu.:2018-10-01
Max. :2018-12-31 00:00:00 Max. :367.00 Max. :65.651 Max. :2018-12-31
NA's :5333
p_daily_mean_speed <-
flows_23_weekday %>%
ggplot() +
geom_line(
aes(x = hms::as_hms(t), y = mean_speed, group = as_date(t)),
alpha = .5
) +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2, 22, 4)),
labels = scales::label_time("%Hh")
) +
theme_classic()
p_daily_mean_speed
sum(is.na(flows_23_weekday$mean_speed))
[1] 5333
sum(is.na(flows_23_weekday$flow))
[1] 0
expected_flow <-
flows_23_weekday %>%
mutate(time = hms::as_hms(t)) %>%
group_by(o,d,time) %>%
summarise(
median_flow = median(flow)
)
deviation_flow <-
flows_23_weekday %>%
mutate(time = hms::as_hms(t)) %>%
group_by(o, d, time) %>%
summarise(
mad_flow = mad(flow)
)
flow_with_expected <-
flows_23_weekday %>%
mutate(time = hms::as_hms(t)) %>%
inner_join(expected_flow, by = c("o", "d", "time"))
flow_with_expected <-
flow_with_expected %>%
mutate(har_mean_speed = mean_speed - (var(mean_speed, na.rm = TRUE)/mean_speed))
cor.test(flow_with_expected$flow, flow_with_expected$har_mean_speed, method = "pearson")
Pearson's product-moment correlation
data: flow_with_expected$flow and flow_with_expected$har_mean_speed
t = -89.549, df = 19626, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5484381 -0.5285732
sample estimates:
cor
-0.5385805
cor.test(flow_with_expected$flow, flow_with_expected$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: flow_with_expected$flow and flow_with_expected$mean_speed
t = -94.53, df = 19626, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5688786 -0.5496514
sample estimates:
cor
-0.5593403
p_median <- ggplot(flow_with_expected) +
geom_line(aes(x = time, y = flow, group = date(t)), color = "grey") +
geom_line(aes(x = time, y = median_flow), color = "black") +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2, 24, 4)),
labels = scales::label_time("%H:%M")
) +
theme_bw()
p_median
flow_with_expected %>%
filter(as_date(t) == "2018-10-02") %>%
mutate(flow_diff = abs(flow - median_flow)) %>%
mutate(outlier = ifelse(flow_diff > 40, TRUE, FALSE)) %>%
ggplot() +
geom_line(aes(x = time, y = flow_diff)) +
geom_hline(yintercept = 40, color = "red") +
theme_bw()
p_daily_ecdf <-
flow_with_expected %>%
ggplot() +
stat_ecdf(
aes(x = flow, group = as_date(t)),
alpha = .7
) +
xlab("Vehicle count per time period (15min)") +
ylab("Cumulative probability") +
theme_bw()
p_daily_ecdf
flows_ecd23 <-
flow_with_expected %>%
mutate(dayt = as_date(t)) %>%
group_by(o, d, dayt) %>%
summarise(ecd = list(ecdf(flow))) %>%
group_by(dayt) %>%
mutate(date_index = group_indices()) %>%
group_by(o, d) %>%
mutate(group_id = group_indices())
head(flows_ecd23)
flow_with_expected %>%
filter(as_date(t) == c("2018-04-02", "2018-01-01"))
longer object length is not a multiple of shorter object length
p_01_ecdf <-
flow_with_expected %>%
filter(as_date(t) == c("2018-04-02", "2018-01-02")) %>%
ggplot() +
stat_ecdf(
aes(x = flow, group = as_date(t)),
alpha = .7
) +
theme_bw() +
xlab("Vehicle count per time period (15min)") +
ylab("Cumulative probability")
longer object length is not a multiple of shorter object length
p_01_ecdf
epout_k2 <- flows_ecd23 %>%
group_map(~ { maotai::epmeans(.x$ecd, k = 2) })
epout_k2
[[1]]
[[1]]$cluster
[1] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2
[53] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 2 1 2 2 2 2 1 2 2 1 2 2 2 1 2 2
[105] 1 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[157] 2 2 2 2 2 2 1 1 1 2 2 2 1 2 2 2 1 2 2 1 1 1 1 1 1 2 2 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[209] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1
[261] 2
[[1]]$centers
[[1]]$centers[[1]]
Empirical CDF
Call: stats::ecdf(as.vector(tmpcpp[k, ]))
x[1:1000] = 3.8111, 3.9107, 4.0103, ..., 269.91, 270.9
[[1]]$centers[[2]]
Empirical CDF
Call: stats::ecdf(as.vector(tmpcpp[k, ]))
x[1:1000] = 1.3508, 1.3792, 1.4075, ..., 104.71, 105.42
daily_cluster_ids <- epout_k2 %>%
lapply(function(x) x$cluster %>%
enframe(name = "date_index", value = "cluster")) %>%
enframe(name = "group_id", value = "value") %>%
unnest(value)
head(daily_cluster_ids)
flow_80quantiles <- flows_23_weekday %>%
group_by(o,d) %>%
summarise(quantile80 = quantile(flow, 0.8))
Assume that the centroids which corresponds to “typical” traffic is the one who carries more traffic most of the times, i.e. will have lower cummulative probability of carrying less or equal than 80th percentile of the flow
ecd_centroids_k2 <- epout_k2 %>%
lapply(function(x) x$centers %>% enframe(name = "cluster", value = "centroid")) %>%
enframe(name = "group_id", value = "value") %>%
unnest(value) %>%
inner_join(flows_ecd23 %>% distinct(o,d) %>% mutate(group_id = group_indices()),
by = "group_id") %>%
select(-group_id) %>%
select(o, d, cluster, centroid) %>%
# label which centroid is typical and atypical
# for a high quantile (e.g. 80% quantile)
inner_join(flow_80quantiles, by = c("o", "d")) %>%
group_by(o, d, cluster) %>%
mutate(prob80 = centroid[[1]](quantile80)) %>%
group_by(o, d) %>%
arrange(prob80) %>%
mutate(cluster_label = c("typical", "atypical")) %>%
mutate(cluster_label = factor(cluster_label)) %>%
arrange(o, d, prob80)
max_flow <- max(flows$flow)
npoints = 500
ecd_centroids_k2_xy <-
ecd_centroids_k2 %>%
group_by(o, d, cluster) %>%
group_modify(~{
tibble(
cluster_label = .$cluster_label,
ecd_x = seq(0, max_flow, length.out = npoints)
) %>%
mutate(ecd_y = .x$centroid[[1]](ecd_x))
})
od_day_labels <- flows_ecd23 %>%
inner_join(daily_cluster_ids, by = c("group_id", "date_index")) %>%
select(-c(date_index, group_id, ecd)) %>%
inner_join(
ecd_centroids_k2 %>% distinct(o, d, cluster, cluster_label),
by = c("o", "d", "cluster")
)
flows_23_labelled <-
flows_23_weekday %>%
mutate(dayt = as_date(t)) %>%
mutate(month = month(t)) %>%
inner_join(od_day_labels %>% select (-cluster), by = c("o", "d", "dayt"))
flows_23_labelled[flows_23_labelled$month == 5,]
p_all_clustered_ecdf <-
flows_23_labelled %>%
mutate(tday = factor(as_date(t))) %>%
ggplot() +
stat_ecdf(
aes(x = flow, group = tday, colour = cluster_label),
alpha = .6
) +
geom_line(
data = ecd_centroids_k2_xy,
mapping = aes(x = ecd_x, y = ecd_y, colour = cluster_label),
size = 2
) +
geom_vline(
xintercept = ecd_centroids_k2$quantile80,
linetype = "dotted",
size = 1.0
) +
geom_hline(
yintercept = ecd_centroids_k2$prob80,
linetype = "dashed",
size = 1.0
) +
scale_color_grey(name = "Daily behaviour") +
theme_bw() +
xlab("Vehicle count per time period (15min)") +
ylab("Cumulative probability")
p_all_clustered_ecdf
p_test <-
flows_23_labelled %>%
mutate(tday = factor(as_date(t))) %>%
ggplot() +
stat_ecdf(
aes(x = flow, group = tday, colour = cluster_label),
alpha = .6
) +
geom_line(
data = ecd_centroids_k2_xy,
mapping = aes(x = ecd_x, y = ecd_y, colour = cluster_label),
size = 2
)
p_test
p_daily_flow_labelled <-
flows_23_labelled %>%
ggplot() +
geom_line(
aes(x = hms::as_hms(t), y = flow, group = as_date(t)), alpha = .5
) +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2,22,4)),
labels = scales::label_time("%Hh")
) +
facet_wrap(~cluster_label) +
theme_bw()
p_daily_flow_labelled
p_daily_speed_labelled <-
flows_23_labelled %>%
ggplot() +
geom_line(
aes(x = hms::as_hms(t), y = mean_speed, group = as_date(t)), alpha = .5
) +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2,22,4)),
labels = scales::label_time("%Hh")
) +
facet_wrap(~cluster_label) +
theme_bw()
p_daily_speed_labelled
x <- flows_23_labelled %>% filter(flow, cluster_label == "typical")
y <- flows_23_labelled %>% filter(mean_speed, cluster_label == "typical")
a <- flows_23_labelled %>% filter(flow, cluster_label == "atypical")
b <- flows_23_labelled %>% filter(mean_speed, cluster_label == "atypical")
cor.test(x$flow, y$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: x$flow and y$mean_speed
t = -93.801, df = 12641, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.6507799 -0.6302219
sample estimates:
cor
-0.6406157
cor.test(a$flow, b$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: a$flow and b$mean_speed
t = -25.772, df = 6983, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3159789 -0.2731468
sample estimates:
cor
-0.2947109
flows_23_atypical <- flows_23_labelled[flows_23_labelled$cluster_label == 'atypical',]
flows_23_typical <- flows_23_labelled[flows_23_labelled$cluster_label == 'typical',]
flows_23_atypical$time <- hms::as_hms(flows_23_atypical$t)
flows_23_atypical_mean <- aggregate(flows_23_atypical[,4], list(as.character(flows_23_atypical$time)), mean)
flows_23_typical$time <- hms::as_hms(flows_23_typical$t)
flows_23_typical_mean <- aggregate(flows_23_typical[,4], list(as.character(flows_23_typical$time)), mean)
flows_23_mean_combined <- data.frame("t"=flows_23_atypical_mean$Group.1, "flow_atypical"=flows_23_atypical_mean$flow, "flow_typical"=flows_23_typical_mean$flow)
# flows_23_mean_combined$t <- as.character(flows_23_mean_combined$t)
# flows_23_mean_combined$t <- chron::as.times(flows_23_mean_combined$t)
flows_23_mean_combined$t <- as.POSIXct(flows_23_mean_combined$t, format = "%H:%M:%S")
flow_23_mean_compare <- ggplot(flows_23_mean_combined, aes(x = t)) +
geom_line(aes(y = flow_atypical), colour = "red") +
geom_line(aes(y = flow_typical), colour= "green") +
scale_x_datetime(date_labels = "%H:%M") +
theme_bw()
flow_23_mean_compare
expected_flow_typ <-
flows_23_typical %>%
mutate(time = hms::as_hms(t)) %>%
#filter(!month(t) %in% c(3,4)) %>%
group_by(o,d,time) %>%
summarise(
median_flow_typ = median(flow)
)
deviation_flow_typ <-
flows_23_typical %>%
mutate(time = hms::as_hms(t)) %>%
group_by(o, d, time) %>%
summarise(
mad_flow_typ = mad(flow)
)
flow_with_expected <- inner_join(flow_with_expected, expected_flow_typ, by = c("o", "d", "time"))
expected_flow_atyp <-
flows_23_atypical %>%
mutate(time = hms::as_hms(t)) %>%
#filter(!month(t) %in% c(3,4)) %>%
group_by(o,d,time) %>%
summarise(
median_flow_atyp = median(flow)
)
expected_flow_atyp_real <-
flows_23_atypical %>%
filter(flow != 0) %>%
group_by(o, d, time) %>%
summarise(
median_flow_atyp_real = median(flow)
)
deviation_flow_atyp <-
flows_23_atypical %>%
mutate(time = hms::as_hms(t)) %>%
group_by(o, d, time) %>%
summarise(
mad_flow_atyp = mad(flow)
)
flow_with_expected <- inner_join(flow_with_expected, expected_flow_atyp, by = c("o", "d", "time"))
flow_with_expected <- inner_join(flow_with_expected, expected_flow_atyp_real, by = c("o", "d", "time"))
p_median_ep <- ggplot(flow_with_expected) +
geom_line(aes(x = time, y = flow, group = date(t)), color = "grey") +
geom_line(aes(x = time, y = median_flow_typ), color = "black") +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2, 24, 4)),
labels = scales::label_time("%H:%M")
) +
theme_bw()
p_median_ep
p_median_ep_atyp <- ggplot(flow_with_expected) +
geom_line(aes(x = time, y = flow, group = date(t)), color = "grey") +
geom_line(aes(x = time, y = median_flow_atyp), color = "black") +
scale_x_time(
name = "Time",
breaks = hms::hms(hours = seq(2, 24, 4)),
labels = scales::label_time("%H:%M")
) +
theme_bw()
p_median_ep
p_median_comp <-
ggplot(flow_with_expected) +
geom_line(
aes(x = time, y = flow, group = date(t)),
color = "grey"
) +
geom_line(
aes(x = time, y = median_flow, group = date(t)),
linetype = "solid"
) +
geom_line(
aes(x = time, y = median_flow_typ, group = date(t)),
linetype = "dotted"
) +
geom_line(
aes(x = time, y = median_flow_atyp, group = date(t)),
linetype = "dashed"
) +
theme_bw()
p_median_comp
p_atypical_comp <-
ggplot(flow_with_expected) +
geom_line(
aes(
x = time,
y = median_flow_atyp
),
linetype = "solid"
) +
geom_line(
aes(
x = time,
y = median_flow_atyp_real
),
linetype = "dashed"
) +
theme(legend.position = "top") +
xlab("Time") +
ylab("Atypical Median") +
theme_bw()
p_atypical_comp
ggarrange(p_median, p_median_ep,
ncol = 3, nrow = 1)
ggplot() +
geom_line(
data = deviation_flow,
aes(x = time, y = mad_flow),
linetype = "solid"
) +
geom_line(
data = deviation_flow_typ,
aes(x = time, y = mad_flow_typ),
linetype = "dotted"
) +
geom_line(
data = deviation_flow_atyp,
aes(x = time, y = mad_flow_atyp),
linetype = "dashed"
) +
theme_bw()
flows_23_typical %>% filter(month == 6) %>%
ggplot() +
geom_point(aes(
x = mean_speed,
y = flow
)) +
theme_bw()
quad_fit <- lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_weekday)
summary(quad_fit)
Call:
lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_weekday)
Residuals:
Min 1Q Median 3Q Max
-170.800 -29.449 -3.577 24.347 199.498
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 174.295536 2.855249 61.044 < 0.0000000000000002 ***
poly(mean_speed, 2, raw = TRUE)1 -0.659595 0.194332 -3.394 0.00069 ***
poly(mean_speed, 2, raw = TRUE)2 -0.056618 0.003224 -17.561 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 54.85 on 19625 degrees of freedom
(5333 observations deleted due to missingness)
Multiple R-squared: 0.3235, Adjusted R-squared: 0.3234
F-statistic: 4692 on 2 and 19625 DF, p-value: < 0.00000000000000022
quad_eq <- quad_fit$coefficient[3]*flows_23_weekday$mean_speed^2 + quad_fit$coefficient[2]*flows_23_weekday$mean_speed + quad_fit$coefficient[1]
quad_eq <- as.data.frame(quad_eq)
quad_fit_plot <- flows_23_weekday %>%
select(flow, mean_speed) %>%
cbind(quad_eq)
quad_fit_plot
p_quad_fit <-
ggplot(quad_fit_plot) +
# geom_point(
# aes(x = mean_speed, y = flow)
# ) +
geom_line(
aes(x = mean_speed, y = quad_eq)
)
p_quad_fit
quad_fit_typical <- lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_typical)
summary(quad_fit_typical)
Call:
lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_typical)
Residuals:
Min 1Q Median 3Q Max
-200.863 -31.218 -1.794 31.615 169.420
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 203.955506 3.378576 60.367 <0.0000000000000002 ***
poly(mean_speed, 2, raw = TRUE)1 -0.466305 0.237505 -1.963 0.0496 *
poly(mean_speed, 2, raw = TRUE)2 -0.076346 0.004023 -18.978 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 56.43 on 12640 degrees of freedom
(605 observations deleted due to missingness)
Multiple R-squared: 0.4267, Adjusted R-squared: 0.4266
F-statistic: 4704 on 2 and 12640 DF, p-value: < 0.00000000000000022
quad_fit_atypical <- lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_atypical)
summary(quad_fit_atypical)
Call:
lm(formula = flow ~ poly(mean_speed, 2, raw = TRUE), data = flows_23_atypical)
Residuals:
Min 1Q Median 3Q Max
-73.385 -21.995 0.234 19.176 249.933
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 15.468279 3.743698 4.132 0.0000364 ***
poly(mean_speed, 2, raw = TRUE)1 4.851603 0.238049 20.381 < 0.0000000000000002 ***
poly(mean_speed, 2, raw = TRUE)2 -0.099861 0.003753 -26.610 < 0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 34.67 on 6982 degrees of freedom
(4728 observations deleted due to missingness)
Multiple R-squared: 0.1709, Adjusted R-squared: 0.1707
F-statistic: 719.8 on 2 and 6982 DF, p-value: < 0.00000000000000022
# Pearson Correlation test entire for entire traffics
cor.test(flows_23_weekday$flow, flows_23_weekday$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: flows_23_weekday$flow and flows_23_weekday$mean_speed
t = -94.53, df = 19626, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5688786 -0.5496514
sample estimates:
cor
-0.5593403
# Pearson Correlation test entire for typical traffics
cor.test(flows_23_typical$flow, flows_23_typical$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: flows_23_typical$flow and flows_23_typical$mean_speed
t = -93.801, df = 12641, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.6507799 -0.6302219
sample estimates:
cor
-0.6406157
# Pearson Correlation test entire for atypical traffics
cor.test(flows_23_atypical$flow, flows_23_atypical$mean_speed, method = "pearson")
Pearson's product-moment correlation
data: flows_23_atypical$flow and flows_23_atypical$mean_speed
t = -25.772, df = 6983, p-value < 0.00000000000000022
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3159789 -0.2731468
sample estimates:
cor
-0.2947109
flows_23_2jan <-
flow_with_expected %>%
filter(as_date(t) == "2018-01-02") %>%
mutate(quantile25 = quantile(flow, 0.25)) %>%
mutate(start = hms::as_hms(t)) %>%
mutate(end = hms::as_hms(start + hms::as_hms("00:15:00"))) %>%
mutate(flow_diff = abs(flow - median_flow)) %>%
mutate(condition = ifelse(flow_diff > quantile25, "ABNORMAL", "NORMAL"))
p_clust_2jan <-
ggplot(flows_23_2jan) +
geom_line(
aes(
x = start,
y = flow
)
) +
geom_rect(
aes(
xmin = start,
xmax = end,
fill = condition
),
ymin = -Inf,
ymax = Inf,
alpha = 0.5
) +
scale_fill_manual(values = c("grey", NA)) +
xlab("Time") +
ylab("Flow") +
theme_bw()
p_clust_2jan
sum(flows_23_2jan$condition == "ABNORMAL")/length(flows_23_2jan$condition)
[1] 0.3645833
nrow(flows_23_weekday[flows_23_weekday$flow == 0,])/nrow(flows_23_weekday[flows_23_weekday$flow != 0,])*100
[1] 27.17037
expected_typical <-
flows_23_labelled %>%
filter(cluster_label == "typical") %>%
mutate(time = hms::as_hms(t)) %>%
group_by(o, d, time) %>%
summarise(
median_typical = median(flow)
)
flow_with_expected <- inner_join(flow_with_expected, expected_typical, by = c("o", "d", "time"))
flows_23_2jan_typ <-
flow_with_expected %>%
filter(as_date(t) == "2018-01-02") %>%
mutate(quantile25 = quantile(flow, 0.25)) %>%
mutate(start = hms::as_hms(t)) %>%
mutate(end = hms::as_hms(start + hms::as_hms("00:15:00"))) %>%
mutate(flow_diff = abs(flow - median_typical)) %>%
mutate(condition = ifelse(flow_diff > quantile25, "ABNORMAL", "NORMAL"))
p_clust_2jan_typ <-
ggplot(flows_23_2jan_typ) +
geom_line(
aes(
x = start,
y = flow
)
) +
geom_rect(
aes(
xmin = start,
xmax = end,
fill = condition
),
ymin = -Inf,
ymax = Inf,
alpha = 0.5
) +
scale_fill_manual(values = c("grey", NA)) +
xlab("Time") +
ylab("Flow") +
theme_bw()
p_clust_2jan_typ
sum(flows_23_2jan_typ$condition == "ABNORMAL")/length(flows_23_2jan_typ$condition)
[1] 0.375
flows_23_10jul <-
flow_with_expected %>%
filter(as_date(t) == "2018-07-10") %>%
mutate(quantile25 = quantile(flow, 0.25)) %>%
mutate(start = hms::as_hms(t)) %>%
mutate(end = hms::as_hms(start + hms::as_hms("00:15:00"))) %>%
mutate(flow_diff = abs(flow - median_flow)) %>%
mutate(condition = ifelse(flow_diff > quantile25, "ABNORMAL", "NORMAL"))
flows_23_10jul_typ <-
flow_with_expected %>%
filter(as_date(t) == "2018-07-10") %>%
mutate(quantile25 = quantile(flow, 0.25)) %>%
mutate(start = hms::as_hms(t)) %>%
mutate(end = hms::as_hms(start + hms::as_hms("00:15:00"))) %>%
mutate(flow_diff = abs(flow - median_typical)) %>%
mutate(condition = ifelse(flow_diff > quantile25, "ABNORMAL", "NORMAL"))
p_clust_10jul <-
ggplot(flows_23_10jul) +
geom_line(
aes(
x = start,
y = flow
)
) +
geom_rect(
aes(
xmin = start,
xmax = end,
fill = condition
),
ymin = -Inf,
ymax = Inf,
alpha = 0.5
) +
scale_fill_manual(values = c("grey", NA)) +
xlab("Time") +
ylab("Flow") +
theme_bw()
p_clust_10jul
sum(flows_23_10jul$condition == "ABNORMAL")/length(flows_23_10jul$condition)
[1] 0.3333333
p_clust_10jul_typ <-
ggplot(flows_23_10jul_typ) +
geom_line(
aes(
x = start,
y = flow
)
) +
geom_rect(
aes(
xmin = start,
xmax = end,
fill = condition
),
ymin = -Inf,
ymax = Inf,
alpha = 0.5
) +
scale_fill_manual(values = c("grey", NA)) +
xlab("Time") +
ylab("Flow") +
theme_bw()
p_clust_10jul_typ
sum(flows_23_10jul_typ$condition == "ABNORMAL")/length(flows_23_10jul_typ$condition)
[1] 0.3333333
fig1 <- ggarrange(p_clust_2jan, p_clust_2jan_typ,
labels = c("A", "B"),
ncol = 1, nrow = 2)
annotate_figure(fig1,
top = text_grob("Abnormal flow detection on 2 January 2018", color = "Black", face = "bold", size = 12))
fig2 <- ggarrange(p_clust_10jul, p_clust_10jul_typ,
labels = c("A", "B"),
ncol = 1, nrow = 2)
annotate_figure(fig2,
top = text_grob("Abnormal flow detection on 10 July 2018", color = "Black", face = "bold", size = 12))
flows_23_atypical_real <-
flows_23_atypical %>%
filter(flow != 0)
flows_23_atypical_real %>%
filter(flow == 1)
flows_23_atypical_real %>%
filter(dayt == "2018-04-24")